{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tokens: ['artifici', 'intellig', 'found', 'academ', 'disciplin', '1956']\n"
     ]
    }
   ],
   "source": [
    "from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer\n",
    "from pymilvus.model.sparse import BM25EmbeddingFunction\n",
    "\n",
    "# there are some built-in analyzers for several languages, now we use 'en' for English.\n",
    "analyzer = build_default_analyzer(language=\"en\")\n",
    "\n",
    "corpus = [\n",
    "    \"Artificial intelligence was founded as an academic discipline in 1956.\",\n",
    "    \"Alan Turing was the first person to conduct substantial research in AI.\",\n",
    "    \"Born in Maida Vale, London, Turing was raised in southern England.\",\n",
    "]\n",
    "\n",
    "# analyzer can tokenize the text into tokens\n",
    "tokens = analyzer(corpus[0])\n",
    "print(\"tokens:\", tokens)\n",
    "\n",
    "# 使用corpus的内容训练一个bm25的模型，用来后面的生成稀疏向量\n",
    "bm25_ef = BM25EmbeddingFunction(analyzer)\n",
    "bm25_ef.fit(corpus)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Embeddings:\n",
      "   (0, 0)\t1.0208816528320312\n",
      "  (0, 1)\t1.0208816528320312\n",
      "  (0, 3)\t1.0208816528320312\n",
      "  (0, 5)\t1.0208816528320312\n",
      "  (1, 0)\t0.960698664188385\n",
      "  (1, 1)\t0.960698664188385\n",
      "  (1, 6)\t0.960698664188385\n",
      "  (1, 7)\t0.960698664188385\n",
      "  (1, 10)\t0.960698664188385\n",
      "  (1, 12)\t0.960698664188385\n",
      "  (2, 7)\t0.907216489315033\n",
      "  (2, 15)\t0.907216489315033\n",
      "  (2, 16)\t0.907216489315033\n",
      "  (2, 17)\t0.907216489315033\n",
      "  (2, 19)\t0.907216489315033\n",
      "  (2, 20)\t0.907216489315033\n",
      "  (3, 0)\t1.089108943939209\n",
      "  (3, 1)\t1.089108943939209\n",
      "  (3, 5)\t1.089108943939209\n",
      "  (4, 7)\t0.960698664188385\n",
      "  (4, 15)\t0.960698664188385\n",
      "  (4, 16)\t0.960698664188385\n",
      "  (4, 17)\t0.960698664188385\n",
      "  (4, 20)\t0.960698664188385\n",
      "Sparse dim: 21 (1, 21)\n"
     ]
    }
   ],
   "source": [
    "docs = [\n",
    "    \"The field of artificial intelligence was established as an academic subject in 1956.\",\n",
    "    \"Alan Turing was the pioneer in conducting significant research in artificial intelligence.\",\n",
    "    \"Originating in Maida Vale, London, Turing grew up in the southern regions of England.\",\n",
    "    \"In 1956, artificial intelligence emerged as a scholarly field.\",\n",
    "    \"Turing, originally from Maida Vale, London, was brought up in the south of England.\"\n",
    "]\n",
    "\n",
    "# Create embeddings for the documents\n",
    "docs_embeddings = bm25_ef.encode_documents(docs)\n",
    "\n",
    "# Print embeddings\n",
    "print(\"Embeddings:\\n\", docs_embeddings)\n",
    "# Since the output embeddings are in a 2D csr_array format, we convert them to a list for easier manipulation.\n",
    "print(\"Sparse dim:\", bm25_ef.dim, list(docs_embeddings)[0].shape)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
